Philadelphia Bikeshare Rebalancing Prediction

MUSA 508, Lab #5

Author

Nissim Lebovits

Published

November 13, 2023

1 Summary

2 Introduction

Show the code
library(tidyverse)
library(sf)
library(lubridate)
library(tigris)
library(tidycensus)
library(viridis)
library(riem)
library(gridExtra)
library(knitr)
library(kableExtra)
library(RSocrata)
library(tidyverse)
library(sf)
library(tmap)
library(sfdep)
library(gganimate)


options(tmap.mode = 'plot', scipen = 999)

tmap_options(basemaps = "Esri.WorldGrayCanvas") #set global tmap basemap

crs <- "epsg:2272"

plotTheme <- theme(
  plot.title =element_text(size=12),
  plot.subtitle = element_text(size=8),
  plot.caption = element_text(size = 6),
  axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
  axis.text.y = element_text(size = 10),
  axis.title.y = element_text(size = 10),
  # Set the entire chart region to blank
  panel.background=element_blank(),
  plot.background=element_blank(),
  #panel.border=element_rect(colour="#F0F0F0"),
  # Format the grid
  panel.grid.major=element_line(colour="#D0D0D0",size=.2),
  axis.ticks=element_blank())

mapTheme <- theme(plot.title =element_text(size=12),
                  plot.subtitle = element_text(size=8),
                  plot.caption = element_text(size = 6),
                  axis.line=element_blank(),
                  axis.text.x=element_blank(),
                  axis.text.y=element_blank(),
                  axis.ticks=element_blank(),
                  axis.title.x=element_blank(),
                  axis.title.y=element_blank(),
                  panel.background=element_blank(),
                  panel.border=element_blank(),
                  panel.grid.major=element_line(colour = 'transparent'),
                  panel.grid.minor=element_blank(),
                  legend.direction = "vertical", 
                  legend.position = "right",
                  plot.margin = margin(1, 1, 1, 1, 'cm'),
                  legend.key.height = unit(1, "cm"), legend.key.width = unit(0.2, "cm"))

palette6 <- c("#eff3ff", "#bdd7e7", "#6baed6", "#3182bd", "#08519c", "#063970")
palette5 <- c("#eff3ff","#bdd7e7","#6baed6","#3182bd","#08519c")
palette4 <- c("#D2FBD4","#92BCAB","#527D82","#123F5A")
palette2 <- c("#6baed6","#08519c")



stations_path <- "indego_data/indego-trips-2022-q4.csv"

suppressMessages(
stations <- read_csv(stations_path) %>%
  filter(!is.na(start_lon),
         !is.na(start_lat),
         !is.na(end_lon),
         !is.na(end_lat)) %>%
  st_as_sf(coords = c('start_lon', 'start_lat'), crs = 4326) %>%
  st_transform(crs = crs) %>%
                mutate(date = as.Date(strptime(start_time, "%m/%d/%Y %H:%M")),
                       week = week(date),
                       dotw = wday(date),
                       interval60 = as.POSIXct(start_time, format = "%m/%d/%Y %H"))
)



### PHL bounds and grid----------------------------------------
phl_path <- "https://opendata.arcgis.com/datasets/405ec3da942d4e20869d4e1449a2be48_0.geojson"
phl <- st_read(phl_path, quiet = TRUE) %>%
          st_transform(crs = crs)

### GET ACS

phlCensus <- get_acs(geography = "tract", 
                          variables = c("B01003_001", 
                                        "B19013_001", 
                                        "B02001_002", 
                                        "B08013_001",
                                        "B08012_001", 
                                        "B08301_001", 
                                        "B08301_010", 
                                        "B01002_001"), 
                          year = 2021, 
                          state = "PA", 
                          geometry = TRUE, 
                          county="Philadelphia",
                          output = "wide") %>%
                  rename(Total_Pop =  B01003_001E,
                         Med_Inc = B19013_001E,
                         Med_Age = B01002_001E,
                         White_Pop = B02001_002E,
                         Travel_Time = B08013_001E,
                         Num_Commuters = B08012_001E,
                         Means_of_Transport = B08301_001E,
                         Total_Public_Trans = B08301_010E) %>%
                  select(Total_Pop, Med_Inc, White_Pop, Travel_Time,
                         Means_of_Transport, Total_Public_Trans,
                         Med_Age,
                         GEOID, geometry) %>%
                  mutate(Percent_White = White_Pop / Total_Pop,
                         Mean_Commute_Time = Travel_Time / Total_Public_Trans,
                         Percent_Taking_Public_Trans = Total_Public_Trans / Means_of_Transport) %>%
                  mutate(
                      nb = st_knn(geometry, 5), # need to impute census values for tracts with pop ~ 0
                      wt = st_weights(nb),
                      Med_Age = ifelse(is.na(Med_Age), purrr::map_dbl(find_xj(Med_Age, nb), mean, na.rm = TRUE), Med_Age),
                      Med_Inc = ifelse(is.na(Med_Inc), purrr::map_dbl(find_xj(Med_Inc, nb), mean, na.rm = TRUE), Med_Inc),
                      Travel_Time = ifelse(is.na(Travel_Time), purrr::map_dbl(find_xj(Travel_Time, nb), mean, na.rm = TRUE), Travel_Time),
                      Percent_White = ifelse(is.na(Percent_White), purrr::map_dbl(find_xj(Percent_White, nb), mean, na.rm = TRUE), Percent_White),
                      Mean_Commute_Time = ifelse(is.na(Mean_Commute_Time), purrr::map_dbl(find_xj(Mean_Commute_Time, nb), mean, na.rm = TRUE), Mean_Commute_Time),
                      Percent_Taking_Public_Trans = ifelse(is.na(Percent_Taking_Public_Trans), purrr::map_dbl(find_xj(Percent_Taking_Public_Trans, nb), mean, na.rm = TRUE), Percent_Taking_Public_Trans))

phlTracts <- 
  phlCensus %>%
  as.data.frame() %>%
  distinct(GEOID, .keep_all = TRUE) %>%
  select(GEOID, geometry) %>% 
  st_sf %>%
  st_transform(crs = crs)


dat_census <- st_join(stations, phlTracts, join = st_intersects, left = TRUE) %>%
                      rename(Origin.Tract = GEOID) %>%
                      mutate(start_lon = unlist(map(geometry, 1)),
                             start_lat = unlist(map(geometry, 2))) %>%
                      as.data.frame() %>%
                      select(-geometry) %>%
                      st_as_sf(., coords = c("end_lon", "end_lat"), crs = crs) %>%
                      st_join(., phlTracts %>%
                                st_transform(crs=crs),
                              join=st_intersects,
                              left = TRUE) %>%
                      rename(Destination.Tract = GEOID)  %>%
                      mutate(end_lon = unlist(map(geometry, 1)),
                             end_lat = unlist(map(geometry, 2)))%>%
                      as.data.frame() %>%
                      select(-geometry)


### import weather--------------------------------------------------------------
weather.Panel <- 
  riem_measures(station = "PHL", date_start = "2022-10-01", date_end = "2023-01-05") %>%
  dplyr::select(valid, tmpf, p01i, sknt)%>%
  replace(is.na(.), 0) %>%
    mutate(date = as.Date(substr(valid,1,13)),
           week = week(date),
           dotw = wday(date),
           interval60 = ymd_h(substr(valid,1,13)))%>%
    group_by(interval60) %>%
    summarize(Temperature = max(tmpf),
              Precipitation = sum(p01i),
              Wind_Speed = max(sknt)) %>%
    mutate(Temperature = ifelse(Temperature == 0, 42, Temperature))


weekend <- c(1, 7)

Bike sharing systems have emerged as a popular, eco-friendly transportation alternative in cities worldwide. These systems provide a network of bicycles and docking stations, allowing users to pick up a bike from one location and drop it off at another, facilitating short, point-to-point trips. However, one significant challenge in these systems is maintaining a balanced distribution of bikes across the network. This is where bike re-balancing becomes crucial.

Due to daily commuting patterns and other socio-economic factors, certain areas in a city may experience a surplus of bikes while others face a shortage. For instance, residential areas might see a high number of bikes in the morning as people commute to work, leaving these stations depleted by evening. Conversely, business districts may have an excess of bikes during the day, which need to be redistributed for the evening commute home. Without effective re-balancing, the utility and efficiency of the bike share system are significantly reduced, leading to customer dissatisfaction and a decline in usage.

Below, we propose and evaluate a rebalancing strategy for Indego, the bike share system in Philadelphia, based on a spatiotemporal predictive model. Launched in 2015, Indego has 1,400 bikes distributed across 130 stations, mostly concentrated in Center City, West Philadelphia, and South Philadelphia.1 Based on data from the 4th quarter of 2022, as well as supplemental weather data, we propose a predictive model optimized for overnight rebalancing with trucks, which aims to minimize expected customer dissatisfaction the next day.2 Based on our predictions, an integer program could be used to optimize redistribution routes.3

Show the code
ggplot() +
  geom_sf(data = phlTracts) +
  geom_sf(data = stations %>%
            group_by(start_station) %>%
            summarize(count = n())) +
  labs(title = "Distribution of Indego Stations in Philadelphia",
       subtitle = "December 2022")+
  mapTheme

3 Methods

3.1 Data

3.1.1 Overview

For our analysis, we rely on three datasets: 1) Indego station and ridership data from Q4 of 2022, 2) weather data collected at Philadelphia International Airport during those same dates and accessed via the reim package in R, and 3) American Community Survey data from 2021.

3.1.2 Exploratory Analysis

Examining our bike share data, we notice clear temporal trends: peaks during certain hours of the day and a longer-time decline in trips across the quarter.

Show the code
ggplot(dat_census %>%
         group_by(interval60, start_station) %>%
         tally())+
  geom_histogram(aes(n), binwidth = 5)+
  labs(title="Bike share trips per hr by station. Philadelphia, Oct. - Dec., 2022",
       x="Trip Counts", 
       y="Number of Stations")+
  plotTheme

The afternoon rush hour, for example, is associated with more stations with a higher number of trips.

Likewise, total bike counts across the city per hour peak during the morning and afternoon rush hours. We also observe that total ridership is meaningfully lower on weekends than during the week.

Show the code
dat_census %>% mutate(hour = hour(interval60)) %>%
ggplot(aes(x = hour, color = as.factor(dotw)))+
     geom_freqpoly(binwidth = 1)+
  labs(title="Bike share trips in Philadelphia, by day of the week, Oct. - Dec., 2022",
       x="Hour", 
       y="Trip Counts")+
     plotTheme

Show the code
weekend <- c(1, 7)

ggplot(dat_census %>% 
         mutate(hour = hour(interval60),
                weekend = ifelse(dotw %in% weekend, "Weekend", "Weekday")))+
     geom_freqpoly(aes(x = hour, color = weekend), binwidth = 1)+
  labs(title="Bike share trips in Philadelphia, Oct. - Dec., 2022, \nweekend vs. weekday",
       x="Hour", 
       y="Trip Counts")+
     plotTheme

Show the code
ggplot(dat_census %>%
         group_by(interval60) %>%
         tally())+
  geom_line(aes(x = interval60, y = n))+
  labs(title="Bike share trips per hr. Philadelphia, Oct. - Dec., 2022",
       x="Date", 
       y="Number of trips")+
  plotTheme

Show the code
dat_census %>%
        mutate(time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
                                 hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
                                 hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
                                 hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush"))%>%
         group_by(interval60, start_station, time_of_day) %>%
         tally()%>%
  group_by(start_station, time_of_day)%>%
  summarize(mean_trips = mean(n))%>%
  ggplot()+
  geom_density(aes(mean_trips, color = time_of_day), fill = NA, alpha = 0.3)+
  labs(title="Mean Number of Hourly Trips Per Station. Philadelphia, Oct. - Dec., 2022",
       x="Number of trips", 
       y="Density",
       color = "Time of Day")+
 # facet_wrap(~time_of_day)+
  plotTheme

These trends are born out spatially, as we observe that the highest ridership per station clusters in the downtown and University City areas during the afternoon rush hour.

Show the code
ggplot()+
  geom_sf(data = phlTracts %>%
          st_transform(crs=crs))+
  geom_point(data = dat_census %>% 
            mutate(hour = hour(interval60),
                weekend = ifelse(dotw > 5, "Weekend", "Weekday"),
                time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
                                 hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
                                 hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
                                 hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush"))%>%
              group_by(start_station, start_lat, start_lon, weekend, time_of_day) %>%
              tally(),
            aes(x=start_lon, y = start_lat, color = n), 
            fill = "transparent", alpha = 0.7, size = 0.7)+
  scale_colour_viridis(direction = -1,
  discrete = FALSE, option = "D")+
  ylim(min(dat_census$start_lat), max(dat_census$start_lat))+
  xlim(min(dat_census$start_lon), max(dat_census$start_lon))+
  facet_grid(weekend ~ time_of_day)+
  labs(title="Bike share trips per hr by station. Philadelphia, Oct. - Dec., 2022")+
  mapTheme

It is likely that this decrease from October through December is related to dropping temperatures, accounted for in our weather data:

Show the code
grid.arrange(
  ggplot(weather.Panel, aes(interval60,Precipitation)) + geom_line() + 
  labs(title="Precipitation", x="Hour", y="Precipitation") + plotTheme,
  ggplot(weather.Panel, aes(interval60,Wind_Speed)) + geom_line() + 
    labs(title="Wind Speed", x="Hour", y="Wind Speed") + plotTheme,
  ggplot(weather.Panel, aes(interval60,Temperature)) + geom_line() + 
    labs(title="Temperature", x="Hour", y="Temperature") + plotTheme,
  top="Weather Data - Philadelphia PHL - Oct. - Dec., 2022")

Show the code
study.panel <- 
  expand.grid(interval60=unique(dat_census$interval60), 
              start_station = unique(dat_census$start_station)) %>%
  left_join(., dat_census %>%
              select(start_station, Origin.Tract, start_lon, start_lat)%>%
              distinct() %>%
              group_by(start_station) %>%
              slice(1))

ride.panel <- 
  dat_census %>%
  mutate(Trip_Counter = 1) %>%
  right_join(study.panel) %>% 
  group_by(interval60, start_station, Origin.Tract, start_lon, start_lat) %>%
  summarize(Trip_Count = sum(Trip_Counter, na.rm=T)) %>%
  left_join(weather.Panel, by = "interval60") %>%
  ungroup() %>%
  filter(!is.na(start_station)) %>%
  mutate(week = week(interval60),
         dotw = wday(interval60)) %>%
  filter(!is.na(Origin.Tract))

ride.panel <- ride.panel %>%
                st_as_sf(coords = c("start_lon", "start_lat"), crs = crs) %>%
                mutate(start_lon = as.numeric(st_coordinates(.)[,1]),
                       start_lat = as.numeric(st_coordinates(.)[,2])) %>%
                st_join((phlCensus %>% st_transform(crs = crs)))

# holidays are nov 23, dec 23, 24, 25, 30, 31
# how do I return the yday for the dates above?
holidays_22 <- c(327, 148, 357, 358, 359, 364, 365)


ride.panel <- 
  ride.panel %>% 
  arrange(start_station, interval60) %>% 
  mutate(lagHour = dplyr::lag(Trip_Count, 1),
         lag2Hours = dplyr::lag(Trip_Count, 2),
         lag3Hours = dplyr::lag(Trip_Count, 3),
         lag4Hours = dplyr::lag(Trip_Count, 4),
         lag12Hours = dplyr::lag(Trip_Count, 12),
         lag1day = dplyr::lag(Trip_Count,24),
         holiday = ifelse(yday(interval60) %in% holidays_22, 1, 0)) %>%
   mutate(day = yday(interval60)) %>%
   mutate(holidayLag = case_when(dplyr::lag(holiday, 1) == 1 ~ "PlusOneDay",
                                 dplyr::lag(holiday, 2) == 1 ~ "PlustTwoDays",
                                 dplyr::lag(holiday, 3) == 1 ~ "PlustThreeDays",
                                 dplyr::lead(holiday, 1) == 1 ~ "MinusOneDay",
                                 dplyr::lead(holiday, 2) == 1 ~ "MinusTwoDays",
                                 dplyr::lead(holiday, 3) == 1 ~ "MinusThreeDays",
                                TRUE ~ "Zero")) %>%
  mutate(nb = st_knn(st_jitter(geometry), 3),
         wt = st_weights(nb),
         Trip_Count_Lag = st_lag(Trip_Count, nb, wt))

as.data.frame(ride.panel) %>%
    group_by(interval60) %>% 
    summarise_at(vars(starts_with("lag"), "Trip_Count"), mean, na.rm = TRUE) %>%
    gather(Variable, Value, -interval60, -Trip_Count) %>%
    mutate(Variable = factor(Variable, levels=c("lagHour","lag2Hours","lag3Hours","lag4Hours",
                                                "lag12Hours","lag1day")))%>%
    group_by(Variable) %>%  
    summarize(correlation = round(cor(Value, Trip_Count),2)) %>%
    kbl() %>%
  kable_paper(bootstrap_options = "striped", full_width = F)
Variable correlation
lagHour 0.90
lag2Hours 0.75
lag3Hours 0.58
lag4Hours 0.42
lag12Hours -0.31
lag1day 0.83

Finally, we note clear spatial and temporal autocorrelation in our data: this map, for example, indicates that data cluster at certain times

Show the code
ride.sum <- ride.panel %>%
                mutate(date = ymd(as.Date(interval60))) %>%
                group_by(date, start_station) %>%
                summarize(count = sum(Trip_Count)) %>% 
                  ungroup() %>%
                  mutate(start_lon = as.numeric(st_coordinates(.)[,1]),
                         start_lat = as.numeric(st_coordinates(.)[,2])) %>%
                st_drop_geometry()

p <- ggplot() +
      geom_sf(data = phlTracts) + 
      geom_point(data = st_drop_geometry(ride.sum),
                 aes(x = start_lon,
                     y = start_lat,
                     color = count),
  fill = "transparent", alpha = 0.4)+
  scale_colour_viridis(direction = 1,
  discrete = FALSE, option = "D") +
  transition_states(date,
                    transition_length = 5, 
                    state_length = 1) +
  enter_fade() +
  exit_fade() +
  labs(title = "Station Usage in Philadelphia, Q4 2022",
       subtitle = "Day of year: {closest_state}",
       color = "Trip Count") +
  mapTheme

animate(p)

3.2 Spatiotemporal Prediction

Use purrr to train and validate several models for comparison on the latter two week test set. Perform either random k-fold cross validation or LOGO-CV on the 5 week panel. You may choose to cross validate by time or space. Interpret your findings in the context of accuracy and generalizability.

Show the code
ride.Train <- filter(ride.panel, week >= 47 | week == 1)
ride.Test <- filter(ride.panel, week < 47 & week != 1)

head(ride.panel %>% filter(week < 26))

reg1 <- 
  lm(Trip_Count ~  hour(interval60) + dotw + Temperature,  data=ride.Train)

reg2 <- 
  lm(Trip_Count ~  start_station + dotw + Temperature,  data=ride.Train)

reg3 <- 
  lm(Trip_Count ~  start_station + hour(interval60) + dotw + Temperature + Precipitation, 
     data=ride.Train)

reg4 <- 
  lm(Trip_Count ~  start_station +  hour(interval60) + dotw + Temperature + Precipitation +
                   lagHour + lag2Hours +lag3Hours + lag12Hours + lag1day, 
     data=ride.Train)

reg5 <- 
  lm(Trip_Count ~  start_station + hour(interval60) + dotw + Temperature + Precipitation +
                   lagHour + lag2Hours +lag3Hours +lag12Hours + lag1day + holidayLag + holiday, 
     data=ride.Train)

reg6 <- 
  lm(Trip_Count ~  start_station + hour(interval60) + dotw + Temperature + Precipitation +
                   lagHour + lag2Hours +lag3Hours +lag12Hours + lag1day + holidayLag + holiday + Trip_Count_Lag, 
     data=ride.Train)
Show the code
ride.Test.weekNest <- 
  ride.Test %>%
  nest(-week) 

model_pred <- function(dat, fit){
   pred <- predict(fit, newdata = dat)}

week_predictions <- 
  ride.Test.weekNest %>% 
    mutate(ATime_FE = map(.x = data, fit = reg1, .f = model_pred),
           BSpace_FE = map(.x = data, fit = reg2, .f = model_pred),
           CTime_Space_FE = map(.x = data, fit = reg3, .f = model_pred),
           DTime_Space_FE_timeLags = map(.x = data, fit = reg4, .f = model_pred),
           ETime_Space_FE_timeLags_holidayLags = map(.x = data, fit = reg5, .f = model_pred),
           FTime_Space_FE_timeLags_holidayLags_tripCountLags = map(.x = data, fit = reg5, .f = model_pred)) %>% 
    gather(Regression, Prediction, -data, -week) %>%
    mutate(Observed = map(data, pull, Trip_Count),
           Absolute_Error = map2(Observed, Prediction,  ~ abs(.x - .y)),
           MAE = map_dbl(Absolute_Error, mean, na.rm = TRUE),
           sd_AE = map_dbl(Absolute_Error, sd, na.rm = TRUE))

4 Results

Interestingly, time seems more important to models than space

Show the code
week_predictions %>%
  dplyr::select(week, Regression, MAE) %>%
  gather(Variable, MAE, -Regression, -week) %>%
  ggplot(aes(week, MAE)) + 
    geom_bar(aes(fill = Regression), position = "dodge", stat="identity") +
    scale_fill_manual(values = palette6) +
    labs(title = "Mean Absolute Errors by model specification and week") +
  plotTheme

Show the code
week_predictions %>% 
    mutate(interval60 = map(data, pull, interval60),
           start_station = map(data, pull, start_station)) %>%
    dplyr::select(interval60, start_station, Observed, Prediction, Regression) %>%
    unnest() %>%
    gather(Variable, Value, -Regression, -interval60, -start_station) %>%
    group_by(Regression, Variable, interval60) %>%
    summarize(Value = sum(Value)) %>%
    ggplot(aes(interval60, Value, colour=Variable)) + 
      geom_line(size = 1.1) + 
      facet_wrap(~Regression, ncol=1) +
      labs(title = "Predicted/Observed bike share time series", subtitle = "Philadelphia; A test set of 3 Months",  x = "Hour", y= "Station Trips") +
      plotTheme

Show the code
week_predictions %>% 
    mutate(interval60 = map(data, pull, interval60),
           start_station = map(data, pull, start_station), 
           start_lat = map(data, pull, start_lat), 
           start_lon = map(data, pull, start_lon)) %>%
    select(interval60, start_station, start_lon, start_lat, Observed, Prediction, Regression) %>%
    unnest() %>%
  filter(Regression == "FTime_Space_FE_timeLags_holidayLags_tripCountLags") %>%
  group_by(start_station, start_lon, start_lat) %>%
  summarize(MAE = mean(abs(Observed-Prediction), na.rm = TRUE))%>%
ggplot(.)+
  geom_sf(data = phlTracts %>%
          st_transform(crs=crs), color = "grey", fill = "transparent")+
  geom_point(aes(x = start_lon, y = start_lat, color = MAE), 
             fill = "transparent", alpha = 0.4)+
  scale_colour_viridis(direction = 1,
  discrete = FALSE, option = "D")+
  ylim(min(dat_census$start_lat), max(dat_census$start_lat))+
  xlim(min(dat_census$start_lon), max(dat_census$start_lon))+
  labs(title="Mean Abs Error, Test Set, Model 6")+
  mapTheme

Show the code
week_predictions %>% 
    mutate(interval60 = map(data, pull, interval60),
           start_station = map(data, pull, start_station), 
           start_lat = map(data, pull, start_lat), 
           start_lon = map(data, pull, start_lon),
           dotw = map(data, pull, dotw)) %>%
    select(interval60, start_station, start_lon, 
           start_lat, Observed, Prediction, Regression,
           dotw) %>%
    unnest(cols = c(interval60, start_station, start_lon, start_lat, Observed, Prediction, dotw)) %>%
  filter(Regression == "FTime_Space_FE_timeLags_holidayLags_tripCountLags")%>%
  mutate(weekend = ifelse(dotw %in% weekend, "Weekend", "Weekday"),
         time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
                                 hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
                                 hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
                                 hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush")) %>%
  ggplot()+
  geom_point(aes(x= Observed, y = Prediction))+
    geom_smooth(aes(x= Observed, y= Prediction), method = "lm", se = FALSE, color = "red")+
    geom_abline(slope = 1, intercept = 0)+
  facet_wrap(time_of_day ~ weekend)+
  labs(title="Observed vs Predicted",
       x="Observed trips", 
       y="Predicted trips")+
  plotTheme

Show the code
week_predictions %>% 
    mutate(interval60 = map(data, pull, interval60),
           start_station = map(data, pull, start_station), 
           start_lat = map(data, pull, start_lat), 
           start_lon = map(data, pull, start_lon),
           dotw = map(data, pull, dotw) ) %>%
    select(interval60, start_station, start_lon, 
           start_lat, Observed, Prediction, Regression,
           dotw) %>%
    unnest() %>%
  filter(Regression == "FTime_Space_FE_timeLags_holidayLags_tripCountLags")%>%
  mutate(weekend = ifelse(dotw %in% weekend, "Weekend", "Weekday"),
         time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
                                 hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
                                 hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
                                 hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush")) %>%
  group_by(start_station, weekend, time_of_day, start_lon, start_lat) %>%
  summarize(MAE = mean(abs(Observed-Prediction), na.rm = TRUE))%>%
  ggplot(.)+
  geom_sf(data = phlTracts %>%
          st_transform(crs=crs), color = "grey", fill = "transparent")+
  geom_point(aes(x = start_lon, y = start_lat, color = MAE), 
             fill = "transparent", size = 1, alpha = 0.4)+
  scale_colour_viridis(direction = 1,
  discrete = FALSE, option = "D")+
  ylim(min(dat_census$start_lat), max(dat_census$start_lat))+
  xlim(min(dat_census$start_lon), max(dat_census$start_lon))+
  facet_grid(weekend~time_of_day)+
  labs(title="Mean Absolute Errors, Test Set")+
  mapTheme

Show the code
week_predictions %>%
    mutate(interval60 = map(data, pull, interval60),
           start_station = map(data, pull, start_station),
           start_lat = map(data, pull, start_lat),
           start_lon = map(data, pull, start_lon),
           dotw = map(data, pull, dotw),
           Percent_Taking_Public_Trans = map(data, pull, Percent_Taking_Public_Trans),
           Med_Inc = map(data, pull, Med_Inc),
           Percent_White = map(data, pull, Percent_White)) %>%
    select(interval60, start_station, start_lon,
           start_lat, Observed, Prediction, Regression,
           dotw, Percent_Taking_Public_Trans, Med_Inc, Percent_White) %>%
    unnest(cols = c(interval60, start_station, start_lon, start_lat, Observed, Prediction, dotw, Percent_Taking_Public_Trans, Med_Inc, Percent_White)) %>%
  filter(Regression == "FTime_Space_FE_timeLags_holidayLags_tripCountLags") %>%
  mutate(weekend = ifelse(dotw %in% weekend, "Weekend", "Weekday"),
         time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
                                 hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
                                 hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
                                 hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush")) %>%
  filter(time_of_day == "AM Rush") %>%
  group_by(start_station, Percent_Taking_Public_Trans, Med_Inc, Percent_White) %>%
  summarize(MAE = mean(abs(Observed-Prediction), na.rm = TRUE))%>%
  gather(-start_station, -MAE, key = "variable", value = "value")%>%
  ggplot(.)+
  #geom_sf(data = chicagoCensus, color = "grey", fill = "transparent")+
  geom_point(aes(x = value, y = MAE), alpha = 0.4)+
  geom_smooth(aes(x = value, y = MAE), method = "lm", se= FALSE)+
  facet_wrap(~variable, scales = "free")+
  labs(title="Errors as a function of socio-economic variables",
       y="Mean Absolute Error (Trips)")+
  plotTheme

5 Discussion

  • both space and time features important
  • model actually seems worse in white, wealthy neighborhoods–is this good or bad?
  • errors are still not randomly spatially distributed, meaning we can continue to engineer better features
  • that said, the predictive power is pretty solid

6 Conclusion

Conclude with how useful your algorithm is for the bike re-balancing plan.

Footnotes

  1. https://en.wikipedia.org/wiki/Indego↩︎

  2. https://www.rideindego.com/about/data/↩︎

  3. https://people.orie.cornell.edu/shane/pubs/BSOvernight.pdf↩︎